{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# note:\n",
    "# to import functions form fuzzy_matching.py, need to add if __name__ == '__main__': to the end of the file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "import pickle\n",
    "from functools import partial\n",
    "import polars as pl\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.dates as mdates\n",
    "import datetime as dt\n",
    "from concurrent.futures import ProcessPoolExecutor\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "\n",
    "PATH_OUTPUT = '' # curated data\n",
    "PATH_DATA = '../data/' # aggregate data (included in this package)\n",
    "\n",
    "sys.path.append(os.path.abspath('..'))\n",
    "import tools\n",
    "\n",
    "sys.path.append(\"../\")\n",
    "\n",
    "pd.set_option('display.max_columns', 500)\n",
    "pd.set_option('display.max_rows', 1000)\n",
    "\n",
    "pl.Config.set_tbl_rows(1000)  # Adjust the number to match or exceed your row count\n",
    "pl.Config.set_tbl_cols(500)  # Adjust the number to match or exceed your column count\n",
    "pl.Config.set_fmt_str_lengths(100);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load data\n",
    "\n",
    "names_tech = tools.clean_company_name(\n",
    "    pl.read_csv(PATH_DATA + 'tech_companies_ca.csv', columns=['company_name'])\n",
    "    .unique()\n",
    ")\n",
    "    \n",
    "names_indeed = (\n",
    "    pl.read_parquet(PATH_OUTPUT + 'indeed_all_jobs.parquet', columns=['company_name'])\n",
    "    .filter(pl.col('company_name').is_not_null())\n",
    "    .unique()\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Data Preprocessing\n",
    "\n",
    "all_names = pl.concat([names_tech, names_indeed], how='vertical')\n",
    "\n",
    "unique_names = all_names.unique().to_series().to_list()\n",
    "\n",
    "preprocess_mapper = pl.DataFrame(\n",
    "    {\n",
    "        \"company_name\": unique_names,\n",
    "        \"cleaned_name\": [\n",
    "            tools.replace_stopwords(name).lower().replace(\" \", \"\")\n",
    "            for name in unique_names\n",
    "            ]\n",
    "    }\n",
    ")\n",
    "\n",
    "df_tech = names_tech.join(preprocess_mapper, how='left', on='company_name')\n",
    "df_indeed = names_indeed.join(preprocess_mapper, how='left', on='company_name')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "NGRAMS = 3\n",
    "\n",
    "def ngrams(string):\n",
    "    ngrams = zip(*[string[i:] for i in range(NGRAMS)])\n",
    "    return [''.join(ngram) for ngram in ngrams]\n",
    "\n",
    "\n",
    "print('fitting vectorizer')\n",
    "all_names_cleaned = (\n",
    "    all_names\n",
    "    .join(preprocess_mapper, on='company_name', how='left')\n",
    "    ['cleaned_name']\n",
    "    .to_list()\n",
    ")\n",
    "vectorizer = TfidfVectorizer(min_df=2, max_df=0.2, analyzer=ngrams)\n",
    "vectorizer.fit(all_names_cleaned)\n",
    "\n",
    "with open(PATH_OUTPUT + 'vectorizer_tech.pickle', 'wb') as handle:\n",
    "    pickle.dump(vectorizer, handle)\n",
    "\n",
    "with open(PATH_OUTPUT + 'vectorizer_tech.pickle', 'rb') as handle:\n",
    "    vectorizer = pickle.load(handle)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# MATCHING\n",
    "\n",
    "def find_matches(names_source, tfidfs_target):\n",
    "    tfidfs_source = vectorizer.transform(names_source)  # important for speed that names_source is list and not a series\n",
    "    mat_similarity = cosine_similarity(tfidfs_source, tfidfs_target)\n",
    "    indices = [i for i in mat_similarity.argmax(axis=1)]\n",
    "    scores = mat_similarity.max(axis=1)\n",
    "    return (indices, scores)\n",
    "\n",
    "def match_names(names_source, names_target):\n",
    "    tfidfs_target = vectorizer.transform(names_target)\n",
    "    indices, scores = find_matches(names_source, tfidfs_target)\n",
    "    matches = [names_target[index] for index in indices]\n",
    "    return matches, scores\n",
    "\n",
    "names_source = df_indeed['cleaned_name'].unique().to_list()\n",
    "names_target = df_tech['cleaned_name'].unique().to_list()\n",
    "\n",
    "def create_dict_matches(names_source, names_target):\n",
    "    matches, scores = match_names(names_source, names_target)\n",
    "    d = {\n",
    "        'name': names_source,\n",
    "        'match': matches,\n",
    "        'score': scores\n",
    "        }\n",
    "    return d\n",
    "\n",
    "df_matching = pl.DataFrame(create_dict_matches(names_source, names_target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# tech firms matches with indeed\n",
    "\n",
    "df_matching_tech_firms = (\n",
    "    df_matching\n",
    "    .with_columns((pl.col('score') > 1-1e-6).alias('tech'))  # only keep perfect matches\n",
    "    .drop('score')\n",
    ")\n",
    "\n",
    "df_matching_tech_firms.write_csv(PATH_OUTPUT + 'matching_tech_firms.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all jobs indeed with tech flag if tech company\n",
    "\n",
    "indeed_all_jobs_tech_flag = (\n",
    "    pl.read_parquet(PATH_OUTPUT + 'indeed_all_jobs.parquet')\n",
    "    .join(preprocess_mapper, how='left', on='company_name')\n",
    "    .join(\n",
    "        df_matching_tech_firms,\n",
    "        how='left',\n",
    "        left_on='cleaned_name',\n",
    "        right_on='name'\n",
    "        )\n",
    "\n",
    "    )\n",
    "indeed_all_jobs_tech_flag.write_csv(PATH_OUTPUT + 'indeed_all_jobs_tech_flag.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Opening and merging data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open NAICS and NOC data, tech firms, and indices\n",
    "noc = pl.read_csv(PATH_OUTPUT + 'jobpostingsclass.csv')\n",
    "\n",
    "naics = pl.read_parquet(\n",
    "    PATH_OUTPUT + 'df_matches_all_naics.parquet',\n",
    "    columns=['job_key', 'date_first_visible', 'last_date', 'modal_naics']\n",
    ")\n",
    "\n",
    "tech = pl.read_csv(\n",
    "    PATH_OUTPUT + 'indeed_all_jobs_tech_flag.csv',\n",
    "    columns=['job_key', 'date_first_visible', 'match', 'tech']\n",
    ")\n",
    "\n",
    "indices = pl.read_csv(PATH_DATA + 'index_digital.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Note: I will take the number of job postings in the occupation data. This eliminates the duplicate ones on job_key, date_first_visible and job_title.\n",
    "print(noc.shape)\n",
    "print(naics.shape)\n",
    "print(tech.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert date_first_visible to date format in all DataFrames before merging\n",
    "# noc\n",
    "noc = noc.with_columns(\n",
    "    pl.col('date_first_visible').str.strptime(pl.Date, fmt='%Y-%m-%d')\n",
    ")\n",
    "print(\"Min date noc: \", noc['date_first_visible'].min())\n",
    "print(\"Max date noc: \", noc['date_first_visible'].max())\n",
    "# naics (already date format)\n",
    "print(\"Min date naics: \", naics['date_first_visible'].min())\n",
    "print(\"Max date naics: \", naics['date_first_visible'].max())\n",
    "# tech\n",
    "tech = tech.with_columns(\n",
    "    pl.col('date_first_visible').str.strptime(pl.Date, fmt='%Y-%m-%d')\n",
    ")\n",
    "print(\"Min date tech: \", tech['date_first_visible'].min())\n",
    "print(\"Max date tech: \", tech['date_first_visible'].max())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge noc with naics\n",
    "df = noc.join(\n",
    "    naics,\n",
    "    on=['job_key', 'date_first_visible'],\n",
    "    how='left'\n",
    ")\n",
    "\n",
    "# Merge with tech\n",
    "df = df.join(\n",
    "    tech,\n",
    "    on=['job_key', 'date_first_visible'],\n",
    "    how='left'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# rename for matching with indices\n",
    "df=df.rename({'NOC':'noc', 'modal_naics': 'naics'})\n",
    "print(df.columns)\n",
    "print(df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge with indices\n",
    "df = df.join(\n",
    "    indices,\n",
    "    on=['noc'],\n",
    "    how='left'\n",
    ")\n",
    "df = df.select([col for col in df.columns if not pl.Series(col).str.contains('Unnamed').any()])\n",
    "print(df.shape)\n",
    "print(df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dates\n",
    "df = df.with_columns(pl.col('date_first_visible').cast(pl.Date)) # it's already date format, but just in case\n",
    "print(df['date_first_visible'].max())   # check it's the most updated data\n",
    "\n",
    "# Extract year and month and add as new columns\n",
    "df = df.with_columns([\n",
    "    pl.col('date_first_visible').dt.year().alias('year'),\n",
    "    pl.col('date_first_visible').dt.month().alias('month')\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function for frequency tables\n",
    "def freq_tab(df: pl.DataFrame, column: str) -> pl.DataFrame:\n",
    "    \"\"\"\n",
    "    Create a frequency table with counts and percentages for a specified column.\n",
    "\n",
    "    Parameters:\n",
    "        df (pl.DataFrame): The input DataFrame.\n",
    "        column (str): The name of the column to tabulate.\n",
    "\n",
    "    Returns:\n",
    "        pl.DataFrame: A DataFrame with unique values, counts, and percentages.\n",
    "    \"\"\"\n",
    "    # Calculate total row count\n",
    "    total_count = df.shape[0]\n",
    "    \n",
    "    # Calculate frequency counts and percentages\n",
    "    return (\n",
    "        df.groupby(column)\n",
    "        .agg(\n",
    "            pl.count().alias('frequency')  # Count the occurrences\n",
    "        )\n",
    "        .with_columns(\n",
    "            (pl.col('frequency') / total_count * 100).alias('percentage')  # Calculate percentages\n",
    "        )\n",
    "    )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Function for cross tabulation with row percentage\n",
    "def cross_tab_percentage(df: pl.DataFrame, var1: str, var2: str) -> pl.DataFrame:\n",
    "    \"\"\"\n",
    "    Create a cross-tabulation of var1 and var2 with percentages as a pivot table.\n",
    "\n",
    "    Parameters:\n",
    "        df (pl.DataFrame): The input DataFrame.\n",
    "        var1 (str): The column to group by (row variable).\n",
    "        var2 (str): The column to pivot (column variable).\n",
    "\n",
    "    Returns:\n",
    "        pl.DataFrame: A pivot table with var2 values as columns and percentages.\n",
    "    \"\"\"\n",
    "    # Get unique values of var1 and var2\n",
    "    unique_var1 = df.select(pl.col(var1)).unique()\n",
    "    unique_var2 = df.select(pl.col(var2)).unique()\n",
    "\n",
    "    # Create a Cartesian join (cross) of all combinations of var1 and var2\n",
    "    all_combinations = unique_var1.join(unique_var2, how=\"cross\")\n",
    "\n",
    "    # Group by var1 and var2, and count occurrences\n",
    "    grouped = (\n",
    "        df.groupby([var1, var2])\n",
    "        .agg(\n",
    "            pl.count().alias('frequency')  # Count occurrences\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # Perform a left join with all combinations to ensure every pair exists\n",
    "    cross_tab = all_combinations.join(grouped, on=[var1, var2], how=\"left\")\n",
    "\n",
    "    # Fill missing frequencies with 0\n",
    "    cross_tab = cross_tab.with_columns(\n",
    "        pl.col('frequency').fill_null(0)\n",
    "    )\n",
    "\n",
    "    # Calculate total counts for each level of var1\n",
    "    totals = (\n",
    "        cross_tab.groupby(var1)\n",
    "        .agg(\n",
    "            pl.col('frequency').sum().alias('total')\n",
    "        )\n",
    "    )\n",
    "\n",
    "    # Join totals back to the cross-tab\n",
    "    cross_tab = cross_tab.join(totals, on=var1)\n",
    "\n",
    "    # Calculate percentages\n",
    "    cross_tab = cross_tab.with_columns(\n",
    "        (pl.col('frequency') / pl.col('total') * 100).alias('percentage')\n",
    "    )\n",
    "\n",
    "    # Pivot the table to have var2 values as columns\n",
    "    pivot_table = cross_tab.pivot(\n",
    "        values=\"percentage\",\n",
    "        index=var1,\n",
    "        columns=var2,\n",
    "        aggregate_function=\"first\"  # Use 'first' as counts are unique per combination\n",
    "    )\n",
    "\n",
    "    return pivot_table"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fill the nulls to strings to avoid problems later\n",
    "df = df.with_columns(\n",
    "    pl.col('digital').fill_null('unknown')  # Replace null values with 'unknown'\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DIGITAL: 0 \"Not enabled infrastructure\" 1 \"Enabled infrastructure\"\n",
    "df=df.with_columns(\n",
    "    pl.when(pl.col('digital') == 'Enabled Infra')\n",
    "    .then(1)\n",
    "    .otherwise(0)\n",
    "    .alias('digital')\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Digital jobs\n",
    "print(freq_tab(df,'digital'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tech companies\n",
    "print(freq_tab(df,'tech'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Filter tech companies\n",
    "temp_new = df.filter(pl.col('tech') == 1)\n",
    "\n",
    "# Group by company_name, count the occurrences of job_title, and sort in descending order\n",
    "result = (\n",
    "    temp_new\n",
    "    .groupby(\"company_name\")\n",
    "    .agg(pl.col(\"job_title\").count().alias(\"job_title_count\"))\n",
    "    .sort(\"job_title_count\", descending=True)\n",
    ")\n",
    "\n",
    "# Show the result\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tech companies by sectors\n",
    "print(cross_tab_percentage(df, 'naics', 'tech'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NAICS by group of tech non-tech, at 3-digits NAICS\n",
    "df = df.with_columns(\n",
    "    pl.col('naics')\n",
    "    .cast(pl.Utf8)\n",
    "    .str.slice(0,3)\n",
    "    .alias('naics_3d')\n",
    ")\n",
    "print(cross_tab_percentage(df, 'naics_3d', 'tech'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NAICS by group of tech non-tech, at 2-digits NAICS\n",
    "df = df.with_columns(\n",
    "    pl.col('naics')\n",
    "    .cast(pl.Utf8)\n",
    "    .str.slice(0,2)\n",
    "    .alias('naics_2d')\n",
    ")\n",
    "print(cross_tab_percentage(df, 'naics_2d', 'tech'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Transformations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Creating the new tech variable:\n",
    "# 1- Includes all tech firms\n",
    "# 2- Includes sectors where these firms are more represented; these are:\n",
    "naics_tech = [811211]\n",
    "naics_3d_tech = [\"443\"]\n",
    "#naics_2d_tech = [\"55\", \"54\", \"52\", \"51\"]\n",
    "\n",
    "df = df.with_columns(\n",
    "    pl.when(\n",
    "        pl.col('naics').is_in(naics_tech) |\n",
    "        pl.col('naics_3d').is_in(naics_3d_tech) #|\n",
    "        #pl.col('naics_2d').is_in(naics_2d_tech)\n",
    "    )\n",
    "    .then(True)\n",
    "    .otherwise(pl.col('tech').fill_null(False))\n",
    "    .alias('tech1')\n",
    ")\n",
    "print(freq_tab(df,'tech1'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# some descriptives\n",
    "cross_tab_percentage(df,'tech1','digital')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# creating the digital tech variable\n",
    "df = df.with_columns(pl.lit(None).alias('digtech'))\n",
    "df = df.with_columns(\n",
    "    pl.when((pl.col('digital') == 0) & (pl.col('tech1') == 0))\n",
    "    .then(0)  # no digi, no tech\n",
    "    .when((pl.col('digital') == 0) & (pl.col('tech1') == 1))\n",
    "    .then(1)  # no digi, tech\n",
    "    .when((pl.col('digital') == 1) & (pl.col('tech1') == 0))\n",
    "    .then(2)  # digi, no tech\n",
    "    .when((pl.col('digital') == 1) & (pl.col('tech1') == 1))\n",
    "    .then(3)  # digi, tech\n",
    "    .otherwise(None)\n",
    "    .alias('digtech')\n",
    ")\n",
    "print(freq_tab(df, 'digtech'))\n",
    "\n",
    "# creating the digital tech variable--restricted version\n",
    "df = df.with_columns(pl.lit(None).alias('digtech0'))\n",
    "df = df.with_columns(\n",
    "    pl.when((pl.col('digital') == 0) & (pl.col('tech') == 0))\n",
    "    .then(0)  # no digi, no tech\n",
    "    .when((pl.col('digital') == 0) & (pl.col('tech') == 1))\n",
    "    .then(1)  # no digi, tech\n",
    "    .when((pl.col('digital') == 1) & (pl.col('tech') == 0))\n",
    "    .then(2)  # digi, no tech\n",
    "    .when((pl.col('digital') == 1) & (pl.col('tech') == 1))\n",
    "    .then(3)  # digi, tech\n",
    "    .otherwise(None)\n",
    "    .alias('digtech0')\n",
    ")\n",
    "print(freq_tab(df, 'digtech0'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Digital - Tech"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transform the data to pandas\n",
    "df_pandas = df.to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Aggregation\n",
    "# cleaning \n",
    "temp = df_pandas[['date_first_visible', 'digital', 'tech', 'tech1', 'digtech', 'digtech0']].copy()\n",
    "\n",
    "# date format\n",
    "temp['date_first_visible'] = pd.to_datetime(temp['date_first_visible'])\n",
    "\n",
    "# aggregation\n",
    "d_weekly = {}\n",
    "\n",
    "# digtech\n",
    "temp1 = temp[['date_first_visible', 'digtech']]\n",
    "for digtech in temp1['digtech'].dropna().unique().tolist():\n",
    "    print(digtech)\n",
    "    temp2 = temp1.loc[temp1['digtech'] == digtech]\n",
    "    temp2.set_index('date_first_visible', inplace=True)\n",
    "    d_weekly[digtech] = temp2.resample('W-Mon', label='left', closed='left').count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Concatenate data\n",
    "df_weekly = pd.concat(d_weekly.values(), axis=1)\n",
    "df_weekly.columns = d_weekly.keys()\n",
    "df_weekly.reset_index(inplace=True)\n",
    "df_weekly.rename(columns={0:'no_digi_no_tech',1:'no_digi_tech',2:'digi_no_tech',3:'digi_tech'}, inplace=True)\n",
    "print(df_weekly.head())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Transformations to the data\n",
    "df_weekly[['digi_no_tech_ma','digi_tech_ma','no_digi_no_tech_ma','no_digi_tech_ma']]=df_weekly[['digi_no_tech','digi_tech','no_digi_no_tech','no_digi_tech']].transform(lambda x:x.rolling(3).mean())\n",
    "df2019 = df_weekly[(df_weekly['date_first_visible']>='2019-01-01')&(df_weekly['date_first_visible']<'2019-12-31')] # 2019 \n",
    "df_weekly['digi_no_tech_2019']=(df_weekly['digi_no_tech_ma']/df2019['digi_no_tech'].mean())*100\n",
    "df_weekly['digi_tech_2019']=(df_weekly['digi_tech_ma']/df2019['digi_tech'].mean())*100\n",
    "df_weekly['no_digi_no_tech_2019']=(df_weekly['no_digi_no_tech_ma']/df2019['no_digi_no_tech'].mean())*100\n",
    "df_weekly['no_digi_tech_2019']=(df_weekly['no_digi_tech_ma']/df2019['no_digi_tech'].mean())*100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Select only indices\n",
    "df_weekly_index = df_weekly[['date_first_visible', 'digi_no_tech_2019', 'digi_tech_2019',\n",
    "       'no_digi_no_tech_2019', 'no_digi_tech_2019']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Saving indices\n",
    "df_weekly_index.to_csv(PATH_DATA + \"weekly_tech_indexedto2019.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.11.2 ('env_indeed')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  },
  "vscode": {
   "interpreter": {
    "hash": "85864727f28aee7f8b5cbbe8b85f993d9af676cc8fd20e8f70c7dc86bdcf9c39"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
